In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# Every parameter defaults to None; the fallback cell further down fills in the
# input paths when the notebook is run by hand and nipah_config is still None.
# input configs
altair_config = None
nipah_config = None
# E2 (CHO-bEFNB2) specific input files
func_scores_E2_file = None
binding_E2_file = None
# E3 (CHO-bEFNB3) specific input files
func_scores_E3_file = None
binding_E3_file = None
# merged (wide) and concatenated (long) entry-score files
merged_df_file = None
concat_df_file = None
# output plots: each value is the path a chart is saved to
output_corr = None
entry_binding_corr_plot_E2_output = None
entry_binding_corr_plot_E3_output = None
corr_entry_binding_large_output = None
combined_binding_output = None
entry_by_site_plot_e2_output = None
entry_by_site_plot_e3_output = None
# NOTE: despite their names, the next two are also output paths, not charts
entry_by_site_plot_e2_bar_plot = None
binding_letter_plot = None
In [2]:
# Parameters
# Concrete values for this run; they override the None defaults declared in the
# parameters cell. All paths are relative to the project working directory.
altair_config = "data/custom_analyses_data/interactive_theme.py"
nipah_config = "nipah_config.yaml"
func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
merged_df_file = "results/filtered_data/entry/e2_e3_entry_filter_merged.csv"
concat_df_file = "results/filtered_data/entry/e2_e3_entry_filter_concat.csv"
output_corr = "results/images/corr_heatmap.html"
entry_binding_corr_plot_E2_output = "results/images/entry_binding_corr_plot_E2.html"
entry_binding_corr_plot_E3_output = "results/images/entry_binding_corr_plot_E3.html"
corr_entry_binding_large_output = "results/images/corr_entry_binding_large.html"
combined_binding_output = "results/images/combined_binding.html"
entry_by_site_plot_e2_output = "results/images/entry_by_site_plot_e2.html"
entry_by_site_plot_e3_output = "results/images/entry_by_site_plot_e3.html"
entry_by_site_plot_e2_bar_plot = "results/images/entry_by_site_plot_e2_bar_plot.html"
binding_letter_plot = "results/images/binding_letter_plot.html"
Import modules¶
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
Set working directory¶
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

# Directory that every relative path in this notebook is resolved against.
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from the notebook location or a parameter so the notebook runs elsewhere.
project_dir = "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"

# os.getcwd() never returns a trailing separator, so comparing it with the raw
# string above (which ends in "/") could never match. Normalise both sides so
# the "already there" branch is actually reachable.
if os.path.normpath(os.getcwd()) == os.path.normpath(project_dir):
    print("Already in correct directory")
else:
    os.chdir(project_dir)
    print("Setup in correct directory")
Setup in correct directory
Setup input file paths for running notebook interactively¶
In [5]:
# Fallback for running the notebook by hand: nipah_config is still None when no
# parameters were injected, so point the inputs at their default locations.
# Only input paths are set here; the output-path variables are left untouched.
if nipah_config is None:
    # config files
    altair_config = 'data/custom_analyses_data/interactive_theme.py'
    nipah_config = 'nipah_config.yaml'

    # per-receptor entry and binding scores
    func_scores_E2_file = "results/filtered_data/entry/e2_entry_filtered.csv"
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    func_scores_E3_file = "results/filtered_data/entry/e3_entry_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"

    # NOTE(review): antibody_file is not referenced anywhere else in the visible
    # notebook; confirm whether it is still needed.
    antibody_file = 'results/filtered_data/escape/mab_filter_concat.csv'

    # combined entry-score tables
    merged_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_merged.csv'
    concat_df_file = 'results/filtered_data/entry/e2_e3_entry_filter_concat.csv'
Read config files¶
In [6]:
# Optional Altair theme: the theme file is Python source that is executed in
# this notebook's namespace.
# NOTE(review): exec() runs whatever the file contains; only point
# altair_config at a trusted, version-controlled file.
if altair_config:
    with open(altair_config, 'r') as file:
        theme_source = file.read()
    exec(theme_source)

# Project configuration; later cells read config['contact_sites'] from it.
with open(nipah_config) as f:
    config = yaml.safe_load(f)
Import filtered data¶
In [7]:
# Entry scores in two layouts:
#   merged_df - one row per site/mutant with *_E2 and *_E3 column suffixes
#   concat_df - one row per site/mutant/cell_type
merged_df = pd.read_csv(merged_df_file)
concat_df = pd.read_csv(concat_df_file)

# Preview the first three rows of each table (merged first, then concat)
display(merged_df.head(3), concat_df.head(3))
| site | wildtype | mutant | effect_E2 | effect_std_E2 | times_seen_E2 | n_selections_E2 | cell_type_E2 | wildtype_site_E2 | wt_type_E2 | mutant_type_E2 | effect_E3 | effect_std_E3 | times_seen_E3 | n_selections_E3 | cell_type_E3 | wildtype_site_E3 | wt_type_E3 | mutant_type_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.702 | 0.1865 | 4.625 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | special | -0.7227 | 0.7828 | 3.000 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | special |
| 1 | 71 | Q | E | -1.199 | 0.3996 | 5.250 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | negative | -0.2482 | 0.9791 | 4.571 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | negative |
| 2 | 71 | Q | F | -0.947 | 0.6969 | 4.625 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | aromatic | -0.4973 | 0.3080 | 3.286 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | aromatic |
| site | wildtype | mutant | effect | effect_std | times_seen | n_selections | cell_type | wildtype_site | wt_type | mutant_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.702 | 0.1865 | 4.625 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | special |
| 1 | 71 | Q | E | -1.199 | 0.3996 | 5.250 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | negative |
| 2 | 71 | Q | F | -0.947 | 0.6969 | 4.625 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | aromatic |
Merge data and make dataframes for plotting¶
In [8]:
# Read filtered cell entry data
def read_func_data(file, name):
    """Load filtered cell-entry scores and label every row with a cell type.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``; the CSV must contain the
        columns ``site``, ``wildtype``, ``mutant`` and ``effect``.
    name : value written into the new ``cell_type`` column.

    Returns
    -------
    pandas.DataFrame with columns site, wildtype, mutant, effect, cell_type.

    Raises
    ------
    KeyError if one of the required columns is missing from the file.
    """
    entry_columns = ['site', 'wildtype', 'mutant', 'effect']
    # assign() builds a new frame rather than writing into a column subset of
    # another frame, which avoids pandas' SettingWithCopyWarning.
    return pd.read_csv(file)[entry_columns].assign(cell_type=name)
# Load the per-receptor entry scores; the second argument becomes the value of
# the cell_type column in the returned frame.
e2_func_df = read_func_data(func_scores_E2_file, 'CHO-bEFNB2')
e3_func_df = read_func_data(func_scores_E3_file, 'CHO-bEFNB3')
# Read filtered binding data
def read_binding_data(file, name):
    """Load filtered receptor-binding scores and label every row with a cell type.

    Parameters
    ----------
    file : path or buffer accepted by ``pd.read_csv``; the CSV must contain the
        columns ``site``, ``wildtype``, ``mutant``, ``binding_mean`` and
        ``mutant_type``.
    name : value written into the new ``cell_type`` column.

    Returns
    -------
    pandas.DataFrame with columns site, wildtype, mutant, binding_mean,
    mutant_type, cell_type.

    Raises
    ------
    KeyError if one of the required columns is missing from the file.
    """
    binding_columns = ['site', 'wildtype', 'mutant', 'binding_mean', 'mutant_type']
    # assign() builds a new frame rather than writing into a column subset of
    # another frame, which avoids pandas' SettingWithCopyWarning.
    return pd.read_csv(file)[binding_columns].assign(cell_type=name)
# Load the per-receptor binding scores; the second argument becomes the value
# of the cell_type column in the returned frame.
e2_bind_df = read_binding_data(binding_E2_file,'CHO-bEFNB2')
e3_bind_df = read_binding_data(binding_E3_file,'CHO-bEFNB3')
# Concat binding and func data, then merge
def concat_dfs(bind1, bind2, entry1, entry2):
    """Stack the two binding frames and the two entry frames, then outer-join them.

    The join keys are site, wildtype, mutant and cell_type. Because the join is
    an outer join, a mutant present in only one of the two stacked tables is
    kept, with NaN in the columns coming from the other table.
    """
    join_keys = ['site', 'wildtype', 'mutant', 'cell_type']
    stacked_binding = pd.concat([bind1, bind2])
    stacked_entry = pd.concat([entry1, entry2])
    return stacked_binding.merge(stacked_entry, on=join_keys, how='outer')
# Long-format table with binding and entry for both cell types
final_merged_df = concat_dfs(e2_bind_df,e3_bind_df,e2_func_df,e3_func_df)
# All inputs for plotting are now ready; preview the first two rows of each.
# Per-receptor entry frames, the concatenated and merged entry tables, and the
# combined binding + entry table:
display(e2_func_df.head(2))
display(e3_func_df.head(2))
display(concat_df.head(2))
display(merged_df.head(2))
display(final_merged_df.head(2))
| site | wildtype | mutant | effect | cell_type | |
|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.702 | CHO-bEFNB2 |
| 1 | 71 | Q | E | -1.199 | CHO-bEFNB2 |
| site | wildtype | mutant | effect | cell_type | |
|---|---|---|---|---|---|
| 0 | 71 | Q | C | -0.7227 | CHO-bEFNB3 |
| 1 | 71 | Q | D | -0.3884 | CHO-bEFNB3 |
| site | wildtype | mutant | effect | effect_std | times_seen | n_selections | cell_type | wildtype_site | wt_type | mutant_type | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.702 | 0.1865 | 4.625 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | special |
| 1 | 71 | Q | E | -1.199 | 0.3996 | 5.250 | 8 | CHO-bEFNB2 | Q71 | hydrophilic | negative |
| site | wildtype | mutant | effect_E2 | effect_std_E2 | times_seen_E2 | n_selections_E2 | cell_type_E2 | wildtype_site_E2 | wt_type_E2 | mutant_type_E2 | effect_E3 | effect_std_E3 | times_seen_E3 | n_selections_E3 | cell_type_E3 | wildtype_site_E3 | wt_type_E3 | mutant_type_E3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | C | -1.702 | 0.1865 | 4.625 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | special | -0.7227 | 0.7828 | 3.000 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | special |
| 1 | 71 | Q | E | -1.199 | 0.3996 | 5.250 | 8.0 | CHO-bEFNB2 | Q71 | hydrophilic | negative | -0.2482 | 0.9791 | 4.571 | 7.0 | CHO-bEFNB3 | Q71 | hydrophilic | negative |
| site | wildtype | mutant | binding_mean | mutant_type | cell_type | effect | |
|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | E | 0.1659 | negative | CHO-bEFNB2 | -1.199 |
| 1 | 71 | Q | F | -0.3429 | aromatic | CHO-bEFNB2 | -0.947 |
Now assign RBP region to the dataframe¶
In [9]:
def find_domain(df):
    """Annotate each row with the RBP region its site belongs to.

    Parameters
    ----------
    df : pandas.DataFrame with the columns site, wildtype, mutant,
        binding_mean, effect, cell_type and mutant_type.

    Returns
    -------
    pandas.DataFrame with columns site, wildtype, mutant, region, binding_mean,
    effect, cell_type, mutant_type and a fresh 0..n-1 index. Rows are grouped in
    region order (Stalk, Neck, Linker, Head) and keep their input order within
    each region. Rows whose site is in none of the ranges are dropped. An empty
    input yields an empty frame that still has these eight columns.
    """
    # Half-open site ranges for each region.
    # NOTE(review): Head stops at site 601, while other cells in this notebook
    # enumerate sites up to 602; confirm whether site 602 should be included.
    region_ranges = {
        "Stalk": range(70, 148),
        "Neck": range(148, 166),
        "Linker": range(166, 178),
        "Head": range(178, 602),
    }
    output_columns = [
        "site", "wildtype", "mutant", "region",
        "binding_mean", "effect", "cell_type", "mutant_type",
    ]

    # Flat site -> region lookup so the annotation is a single vectorised map
    # instead of a Python loop over every row.
    site_to_region = {
        site: region for region, sites in region_ranges.items() for site in sites
    }
    region_rank = {region: rank for rank, region in enumerate(region_ranges)}

    annotated = df.assign(region=df["site"].map(site_to_region))
    annotated = annotated[annotated["region"].notna()]
    # Stable sort: group by region order without disturbing the input order
    # inside each region.
    annotated = annotated.sort_values(
        "region", key=lambda col: col.map(region_rank), kind="mergesort"
    )
    return annotated[output_columns].reset_index(drop=True)
# Add the region column to the combined binding + entry table and show it
binding_entry_concat_df = find_domain(final_merged_df)
display(binding_entry_concat_df)
| site | wildtype | mutant | region | binding_mean | effect | cell_type | mutant_type | |
|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | E | Stalk | 0.16590 | -1.19900 | CHO-bEFNB2 | negative |
| 1 | 71 | Q | F | Stalk | -0.34290 | -0.94700 | CHO-bEFNB2 | aromatic |
| 2 | 71 | Q | G | Stalk | 0.46570 | -1.37400 | CHO-bEFNB2 | special |
| 3 | 71 | Q | H | Stalk | 0.02003 | -0.40780 | CHO-bEFNB2 | positive |
| 4 | 71 | Q | K | Stalk | 0.08932 | -0.05105 | CHO-bEFNB2 | positive |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19398 | 601 | C | F | Head | NaN | -1.66700 | CHO-bEFNB3 | NaN |
| 19399 | 601 | C | G | Head | NaN | -2.04700 | CHO-bEFNB3 | NaN |
| 19400 | 601 | C | I | Head | NaN | -0.75770 | CHO-bEFNB3 | NaN |
| 19401 | 601 | C | P | Head | NaN | -1.52300 | CHO-bEFNB3 | NaN |
| 19402 | 601 | C | V | Head | NaN | 0.01403 | CHO-bEFNB3 | NaN |
19403 rows × 8 columns
Make a site-averaged dataframe of cell entry and binding¶
In [10]:
# Average effect and binding over all mutants at each site, per cell type
tmp_df = (
    binding_entry_concat_df
    .groupby(['site', 'cell_type'])[['effect', 'binding_mean']]
    .mean()
    .reset_index()
)

# One row per distinct (site, wildtype, region) combination, used to put the
# wildtype residue and region back onto the averaged table
subset_df = binding_entry_concat_df.drop_duplicates(['site', 'wildtype', 'region'])
site_annotations = subset_df[['site', 'wildtype', 'region']]

mean_df = tmp_df.merge(site_annotations, on='site', how='left')
display(mean_df.head(5))
| site | cell_type | effect | binding_mean | wildtype | region | |
|---|---|---|---|---|---|---|
| 0 | 71 | CHO-bEFNB2 | -1.145872 | 0.146081 | Q | Stalk |
| 1 | 71 | CHO-bEFNB3 | -0.616870 | -0.111348 | Q | Stalk |
| 2 | 72 | CHO-bEFNB2 | -1.221033 | 0.035927 | N | Stalk |
| 3 | 72 | CHO-bEFNB3 | -0.759448 | -0.103486 | N | Stalk |
| 4 | 73 | CHO-bEFNB2 | -0.736845 | 0.130661 | Y | Stalk |
Make a pivot table for plotting certain data¶
In [11]:
# Show the long-format table again as a reference for the pivot built next
display(binding_entry_concat_df)
| site | wildtype | mutant | region | binding_mean | effect | cell_type | mutant_type | |
|---|---|---|---|---|---|---|---|---|
| 0 | 71 | Q | E | Stalk | 0.16590 | -1.19900 | CHO-bEFNB2 | negative |
| 1 | 71 | Q | F | Stalk | -0.34290 | -0.94700 | CHO-bEFNB2 | aromatic |
| 2 | 71 | Q | G | Stalk | 0.46570 | -1.37400 | CHO-bEFNB2 | special |
| 3 | 71 | Q | H | Stalk | 0.02003 | -0.40780 | CHO-bEFNB2 | positive |
| 4 | 71 | Q | K | Stalk | 0.08932 | -0.05105 | CHO-bEFNB2 | positive |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 19398 | 601 | C | F | Head | NaN | -1.66700 | CHO-bEFNB3 | NaN |
| 19399 | 601 | C | G | Head | NaN | -2.04700 | CHO-bEFNB3 | NaN |
| 19400 | 601 | C | I | Head | NaN | -0.75770 | CHO-bEFNB3 | NaN |
| 19401 | 601 | C | P | Head | NaN | -1.52300 | CHO-bEFNB3 | NaN |
| 19402 | 601 | C | V | Head | NaN | 0.01403 | CHO-bEFNB3 | NaN |
19403 rows × 8 columns
In [12]:
# Wide table: one row per region/site/wildtype/mutant, with effect,
# binding_mean and mutant_type split into one column per cell type
df_pivot = binding_entry_concat_df.pivot_table(
    index=['region', 'site', 'wildtype', 'mutant'],
    columns='cell_type',
    values=['effect', 'binding_mean', 'mutant_type'],
    aggfunc='first',
).reset_index()

# Collapse the two-level column labels into "<value>_<cell_type>"; the index
# columns have an empty second level and keep their plain name
df_pivot.columns = [
    '_'.join(label).strip() if label[1] else label[0]
    for label in df_pivot.columns.values
]

# Shorter names for the numeric columns; the mutant_type_* columns keep their
# flattened names
short_names = {
    'effect_CHO-bEFNB2': 'effect_E2',
    'effect_CHO-bEFNB3': 'effect_E3',
    'binding_mean_CHO-bEFNB2': 'binding_E2',
    'binding_mean_CHO-bEFNB3': 'binding_E3',
}
df_pivot = df_pivot.rename(columns=short_names)

display(df_pivot.sort_values(by='site').head(5))
| region | site | wildtype | mutant | binding_E2 | binding_E3 | effect_E2 | effect_E3 | mutant_type_CHO-bEFNB2 | mutant_type_CHO-bEFNB3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 8517 | Stalk | 71 | Q | L | NaN | -0.03759 | NaN | -0.1305 | NaN | hydrophobic |
| 8512 | Stalk | 71 | Q | F | -0.3429 | -0.03982 | -0.947 | -0.4973 | aromatic | aromatic |
| 8511 | Stalk | 71 | Q | E | 0.1659 | -0.35270 | -1.199 | -0.2482 | negative | negative |
| 8510 | Stalk | 71 | Q | D | NaN | -0.29370 | NaN | -0.3884 | NaN | negative |
| 8513 | Stalk | 71 | Q | G | 0.4657 | -0.04107 | -1.374 | -1.3310 | special | special |
Make plots¶
Make heatmap of correlations between entry in CHO-bEFNB2 and CHO-bEFNB3¶
In [13]:
def correlation_heatmap(df):
    """Binned 2-D heatmap of entry in CHO-bEFNB2 (x) versus CHO-bEFNB3 (y).

    ``df`` must contain ``effect_E2`` and ``effect_E3`` columns. Each axis is
    binned with at most 75 bins, and the cell colour encodes the number of rows
    in the bin on a log scale. Returns the configured Altair chart.
    """
    axis_ticks = [-4, -3, -2, -1, 0, 1]
    heading = alt.Title(
        'Effects of RBP Mutations on Cell Entry',
        subtitle='Between CHO cells expressing bat EFNB2 or EFNB3',
    )

    x_binned = alt.X(
        "effect_E2",
        title="RBP mutant entry in CHO-bEFNB2",
        axis=alt.Axis(values=axis_ticks),
    ).bin(maxbins=75)
    y_binned = alt.Y(
        "effect_E3",
        title="RBP mutant entry in CHO-bEFNB3",
        axis=alt.Axis(values=axis_ticks),
    ).bin(maxbins=75)
    count_color = alt.Color('count():Q', title='Count').scale(type='log')

    heatmap = (
        alt.Chart(df, title=heading)
        .mark_rect()
        .encode(x_binned, y_binned, count_color, tooltip=['count()'])
        .properties(height=400, width=400)
    )
    # Legend placement options: "left", "right", "top", "bottom", "top-left",
    # "top-right", "bottom-left", "bottom-right", "none"
    return heatmap.configure_legend(
        padding=2,
        orient='top-left',
        labelFontSize=16,
        titlePadding=2,
        symbolSize=100,
    )
corr_heatmap = correlation_heatmap(merged_df)
corr_heatmap.display()
# Guard on the path that is actually written; the previous check tested an
# unrelated output parameter (entry_by_site_plot_e3_output).
if output_corr is not None:
    corr_heatmap.save(output_corr)
Make interactive plot linking individual binding and entry effects with top 10 summed binding and entry¶
In [14]:
def plot_entry_binding_interactive(df, name):
    """Entry-vs-binding scatter linked to two "top 10 sites" bar charts.

    ``df`` needs site, wildtype, mutant, effect and binding_mean columns.
    ``name`` is interpolated into the chart title. The function reads the
    notebook-level ``config['contact_sites']`` to flag contact sites.

    Dragging a box in the scatter plot filters both bar charts to the brushed
    points. Each bar chart sums its value per site and keeps the sites ranked
    10 or better by descending sum.
    """
    # Work on a copy that carries a boolean "is this a contact site" column
    plot_df = df.assign(is_contact=df['site'].isin(config['contact_sites']))

    # Interval brush shared by all three panels
    brush = alt.selection_interval()

    # Scatter: brushed points are coloured by contact status, the rest are gray
    scatter = (
        alt.Chart(plot_df)
        .mark_point(filled=True, size=50)
        .encode(
            alt.X("effect", title="Cell Entry", axis=alt.Axis(values=[-2, -1, 0, 1])),
            alt.Y("binding_mean", title="Binding", axis=alt.Axis(values=[-4, -2, 0, 2])),
            color=alt.condition(brush, 'is_contact', alt.value('lightgray')),
            tooltip=["site", "wildtype", "mutant", "binding_mean", "effect"],
        )
        .add_params(brush)
        .properties(width=400, height=400)
    )

    # Bars: summed binding_mean per site among the brushed points, top 10
    bars_binding = (
        alt.Chart(plot_df)
        .transform_filter(brush)
        .transform_aggregate(
            binding_aggr='sum(binding_mean)',
            groupby=['site', 'is_contact'],
        )
        .transform_window(
            rank='rank(binding_aggr)',
            sort=[alt.SortField('binding_aggr', order='descending')],
        )
        .transform_filter(alt.datum.rank <= 10)
        .mark_bar()
        .encode(
            alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),
            alt.Y('binding_aggr:Q', title='Binding'),
            color=alt.Color('is_contact', title='Receptor Contact Site'),
        )
        .properties(width=200, height=50)
    )

    # Bars: same construction, but summing the entry effect
    bars_effect = (
        alt.Chart(plot_df, title='Top 10')
        .transform_filter(brush)
        .transform_aggregate(
            effect_aggr='sum(effect)',
            groupby=['site', 'is_contact'],
        )
        .transform_window(
            rank='rank(effect_aggr)',
            sort=[alt.SortField('effect_aggr', order='descending')],
        )
        .transform_filter(alt.datum.rank <= 10)
        .mark_bar()
        .encode(
            x=alt.X('site:N', sort='-y', title='Site', axis=alt.Axis(labelAngle=-90)),
            y=alt.Y('effect_aggr:Q', title='Entry'),
            color=alt.Color('is_contact', title='Receptor Contact Site'),
        )
        .properties(width=200, height=50)
    )

    # Scatter on top; the two bar charts sit side by side underneath it
    layout = scatter & (bars_effect | bars_binding)
    return layout.properties(
        title={
            "text": f"Correlation of Cell Entry and Binding for {name}",
            "subtitle": ["Draw box in scatterplot to show the top 10 sites by summed binding or entry"],
        }
    )
In [15]:
entry_binding_corr_plot_E2 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-bEFNB2"'),'bEFNB2')
entry_binding_corr_plot_E2.display()
# Guard on the path that is actually written; the previous check tested an
# unrelated output parameter (entry_by_site_plot_e3_output).
if entry_binding_corr_plot_E2_output is not None:
    entry_binding_corr_plot_E2.save(entry_binding_corr_plot_E2_output)
Now do the same as above for EFNB3¶
In [16]:
entry_binding_corr_plot_E3 = plot_entry_binding_interactive(final_merged_df.query('cell_type == "CHO-bEFNB3"'),'bEFNB3')
entry_binding_corr_plot_E3.display()
# Guard on the path that is actually written; the previous check tested an
# unrelated output parameter (entry_by_site_plot_e3_output).
if entry_binding_corr_plot_E3_output is not None:
    entry_binding_corr_plot_E3.save(entry_binding_corr_plot_E3_output)
Make correlation plots for entry and binding for both efnb2 and efnb3 colored by protein region¶
In [17]:
def correlation_plot(df):
    """Two E2-vs-E3 scatter plots (entry, binding) with a region radio selector.

    ``df`` must provide region, site, wildtype, mutant, effect_E2, effect_E3,
    binding_E2 and binding_E3 columns; values are rounded to two decimals
    before plotting. Choosing a region colours its points and dims the others;
    the 'All' option (bound to None) is the no-selection choice.
    """
    regions = ['Stalk', 'Neck', 'Linker', 'Head']
    radio_labels = [f'{region} ' for region in regions]
    rounded = df.round(2)

    # Radio buttons bound to a point selection on the region field
    region_radio = alt.binding_radio(
        options=regions + [None],
        labels=radio_labels + ['All'],
        name='Region: ',
    )
    selection = alt.selection_point(fields=['region'], bind=region_radio)

    # Selected points keep their region colour at full opacity; the rest are
    # drawn light gray and half transparent
    color = alt.condition(
        selection,
        alt.Color('region:N', scale=alt.Scale(domain=regions), title='Region'),
        alt.value('lightgray'),
    )
    opacity = alt.condition(selection, alt.value(1), alt.value(0.5))

    hover_fields = ['wildtype', "site", "mutant", 'effect_E2', 'effect_E3', 'binding_E2', 'binding_E3']

    def scatter_panel(panel_title, x_field, x_title, y_field, y_title):
        # Both panels share every setting except the plotted fields and titles
        return (
            alt.Chart(rounded, title=panel_title)
            .mark_point(size=30, opacity=1, filled=True)
            .encode(
                alt.X(x_field, title=x_title, axis=alt.Axis(tickCount=4)),
                alt.Y(y_field, title=y_title, axis=alt.Axis(tickCount=4)),
                tooltip=hover_fields,
                opacity=opacity,
                color=color,
            )
            .properties(height=350, width=350)
        )

    effect_chart = scatter_panel(
        'Cell entry', "effect_E2", "Entry in CHO-bEFNB2", "effect_E3", "Entry in CHO-bEFNB3"
    )
    binding_chart = scatter_panel(
        'Receptor Binding', "binding_E2", "bEFNB2 Binding", "binding_E3", "bEFNB3 Binding"
    )

    # Place the panels side by side and attach the selection to the pair
    side_by_side = effect_chart | binding_chart
    return side_by_side.add_params(selection).properties(
        title=alt.Title(
            'Correlations Between Entry and Binding for bEFNB2 and bEFNB3',
            subtitle=['Select radio button to see mutants highlighted or hover over points to see more information'],
        )
    )
In [18]:
# Call function above and save
corr_entry_binding_large = correlation_plot(df_pivot)
corr_entry_binding_large.display()
# Guard on the path that is actually written; the previous check tested an
# unrelated output parameter (entry_by_site_plot_e3_output).
if corr_entry_binding_large_output is not None:
    corr_entry_binding_large.save(corr_entry_binding_large_output)
Make figures showing only binding¶
In [19]:
def make_custom_binding_figure(df, name):
    """Jittered strip plot of binding by RBP region, plus a linked top-10 bar chart.

    ``df`` needs region, site, mutant and binding_mean columns; ``name`` is used
    as the title of the strip plot. Brushing the strip plot filters the bar
    chart, which sums binding_mean per site and keeps the sites ranked 10 or
    better by descending sum. Returns the two charts stacked vertically.
    """
    brush = alt.selection_interval()
    region_order = ["Stalk", "Neck", "Linker", "Head"]

    # One point per row; the y offset is a random value computed in the chart
    # itself, which spreads the points within each region band
    strip = (
        alt.Chart(df, title=alt.Title(f'{name}'))
        .mark_point(opacity=0.3, filled=True)
        .encode(
            alt.X("binding_mean", title="Binding", axis=alt.Axis(tickCount=4)),
            alt.Y("region:O", sort=region_order, title="RBP Region"),
            yOffset="random:Q",
            tooltip=["region", "binding_mean", "site", "mutant"],
            color=alt.condition(brush, 'region', alt.value('lightgray')),
        )
        .transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())")
        .properties(height=200, width=400)
        .add_params(brush)
    )

    # Summed binding per site for the brushed points, limited to the top 10
    top_sites = (
        alt.Chart(df)
        .transform_filter(brush)
        .transform_aggregate(
            binding_aggr='sum(binding_mean)',
            groupby=['site', 'region'],
        )
        .transform_window(
            rank='rank(binding_aggr)',
            sort=[alt.SortField('binding_aggr', order='descending')],
        )
        .transform_filter(alt.datum.rank <= 10)
        .mark_bar()
        .encode(
            y=alt.Y('binding_aggr:Q', title='Binding'),
            x=alt.X('site:N', sort='-y', title='Site'),
            color=alt.Color('region', title='Region'),
        )
        .properties(height=50, width=400)
    )

    return strip & top_sites
In [20]:
# Build one binding-by-region figure per receptor
efnb2_binding_region = make_custom_binding_figure(binding_entry_concat_df.query('cell_type == "CHO-bEFNB2"'),'bEFNB2')
efnb3_binding_region = make_custom_binding_figure(binding_entry_concat_df.query('cell_type == "CHO-bEFNB3"'),'bEFNB3')

# Combine the two figures side by side under a common title
combined_binding = (efnb2_binding_region | efnb3_binding_region).properties(
    title=alt.Title('Receptor Binding by RBP mutant',
                    subtitle='Draw boxes around scatter plots to see sites with the top 10 summed mutant binding')
)
combined_binding.display()
# Guard on the path that is actually written; the previous check tested an
# unrelated output parameter (entry_by_site_plot_e3_output).
if combined_binding_output is not None:
    combined_binding.save(combined_binding_output)
Make interactive plots of average effects of mutants by site¶
In [21]:
def entry_by_site(df):
    """Bar chart of mean entry per site, one facet row per cell type.

    ``df`` needs site, wildtype, region, cell_type and effect columns; values
    are rounded to two decimals. Hovering outlines the bars of the nearest
    site.

    NOTE(review): a later cell defines another function with this same name
    and a different signature, which replaces this one once it has run.
    """
    rounded = df.round(2)
    region_order = ["Stalk", "Neck", "Linker", "Head"]  # order of the colour legend

    # Hover selection keyed on site
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )

    site_axis = alt.Axis(
        labelAngle=-90,
        values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600],
        tickCount=11,
        grid=True,
    )
    bars = (
        alt.Chart(rounded)
        .mark_bar(opacity=1, stroke='black')
        .encode(
            alt.X("site:N", title='Site', axis=site_axis),
            alt.Y("effect", title="Mean entry"),
            tooltip=['wildtype', "site", "effect", "region"],
            color=alt.Color('region', sort=region_order, title='Region'),
            # the black outline is only visible on the hovered site
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            row=alt.Row('cell_type', title=None, header=alt.Header(labelFontSize=16, labelFontWeight='bold')),
        )
        .properties(width=800, height=150)
    )

    return bars.properties(
        title=alt.Title(
            'Average Cell Entry of Mutants at Each Site',
            subtitle=['Hover mouse over bars to view information about cell entry'],
        )
    ).add_params(variant_selector)
# Mean entry per site for both cell types (displayed only; saving is disabled)
entry_by_site_plot = entry_by_site(mean_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')
In [22]:
def binding_by_site(df):
    """Bar chart of mean receptor binding per site, one facet row per cell type.

    ``df`` needs site, cell_type, effect, binding_mean, wildtype and region
    columns; values are rounded to two decimals. Hovering outlines the bars of
    the nearest site, and the tooltip shows both the mean binding being plotted
    and the mean entry effect. Each facet row has its own y scale.
    """
    # Site 500 is one of the requested x-axis tick values. When it has been
    # filtered out of the data, add a zero-valued placeholder row so it still
    # appears on the axis; skip the placeholder when real data exist.
    placeholder_site = 500
    if placeholder_site not in set(df['site']):
        dummy_data = pd.DataFrame({
            'site': [placeholder_site],
            'cell_type': ['CHO-bEFNB2'],
            'effect': [0],
            'binding_mean': [0],
            'wildtype': [None],  # no wildtype residue is known for the placeholder
            'region': ['Head'],
        })
        df = pd.concat([df, dummy_data])
    df = df.round(2)

    custom_order = ["Stalk", "Neck", "Linker", "Head"]  # order of the colour legend
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )
    chart = (
        alt.Chart(df)
        .mark_bar(opacity=1, stroke='black')
        .encode(
            alt.X("site:N", title='Site', axis=alt.Axis(labelAngle=-90, values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600], tickCount=11, grid=True)),
            alt.Y("binding_mean", title="Mean binding"),
            # include the plotted quantity itself; the tooltip previously showed
            # only the entry effect
            tooltip=['wildtype', "site", "binding_mean", "effect", "region"],
            color=alt.Color('region', sort=custom_order, title='Region'),
            # the black outline is only visible on the hovered site
            strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
            row=alt.Row('cell_type', title=None, header=alt.Header(labelFontSize=16, labelFontWeight='bold')),
        )
        .properties(width=800, height=150)
    )
    combined_chart = chart.properties(
        title=alt.Title(
            'Average Receptor Binding of Mutants at Each Site',
            subtitle=['Hover mouse over bars to view information about receptor binding'],
        )
    ).add_params(variant_selector).resolve_scale(y='independent')
    return combined_chart
In [23]:
# Mean binding per site for both cell types (displayed only; saving is disabled)
# NOTE(review): this reuses the name entry_by_site_plot for a binding chart,
# overwriting the entry chart assigned to that name in an earlier cell.
entry_by_site_plot = binding_by_site(mean_df)
entry_by_site_plot.display()
#entry_by_site_plot.save('results/images/entry_by_site_plot.html')
Make interactive chart similar to above but also show individual mutations in heatmaps¶
In [24]:
def entry_by_site(df, name, effect):
    """Mean-entry bar chart by site with a per-mutant strip for the hovered site.

    Parameters
    ----------
    df : DataFrame with site, mutant, wildtype, region and the ``effect`` column.
    name : text interpolated into the chart title.
    effect : name of the column holding the entry effect (e.g. 'effect_E2').

    NOTE(review): this definition reuses the name of an earlier one-argument
    ``entry_by_site`` and replaces it once this cell has run.
    """
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]

    # Scaffold holding every (site, mutant) pair for sites 71-602, left-joined
    # with the data so combinations without a measurement are still present
    scaffold = pd.DataFrame(
        [{"site": site, "mutant": aa} for site in range(71, 603) for aa in amino_acid_order]
    )
    full_df = scaffold.merge(df, on=['site', 'mutant'], how='left')

    # Hover selection keyed on site
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )

    # Upper panel: mean effect per site, with the hovered site emphasised
    base = alt.Chart(full_df).add_params(variant_selector)
    site_bars = base.mark_bar(opacity=1, stroke='black').encode(
        alt.X(
            "site:N",
            title='Site',
            axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600], tickCount=11, grid=True),
        ),
        alt.Y(f"mean({effect})", title="Mean entry"),
        tooltip=["site", "wildtype", "region"],
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.7)),
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        color=alt.Color('region', title='Region'),
    ).properties(width=800, height=200)

    # Lower panel: one coloured cell per mutant amino acid at the hovered site
    effect_color = alt.Color(
        f'{effect}',
        legend=alt.Legend(orient='right', direction='horizontal', titleAlign='center', titleAnchor='middle'),
        title='Cell entry',
        scale=alt.Scale(scheme='redblue', domainMid=0, domain=[-4, 2]),
    )
    mutant_cells = (
        alt.Chart(full_df)
        .mark_bar(stroke='black')
        .encode(
            alt.X('mutant:N', title=None, scale=alt.Scale(domain=amino_acid_order)),
            color=effect_color,
        )
        .transform_filter(variant_selector)
        .properties(width=400, height=10)
    )

    # "X" text mark positioned at the wildtype residue of the hovered site.
    # The `!= None` comparison is deliberate: it builds a chart-side filter
    # expression rather than being evaluated in Python.
    wildtype_marker = (
        alt.Chart(full_df)
        .mark_text(color="black", text="X", size=10, align="center", baseline="middle")
        .encode(alt.X('wildtype:N', title='Amino acid'))
        .transform_filter(variant_selector)
        .transform_filter((alt.datum[effect] != None))
        .properties(width=400, height=10)
    )

    # Layer marker and cells on a shared x scale, then stack under the bars
    mutant_strip = alt.layer(wildtype_marker, mutant_cells).resolve_scale(x='shared')
    stacked = alt.vconcat(site_bars, mutant_strip).resolve_scale(y='independent', x='independent')
    return stacked.properties(
        title=alt.Title(
            f'Entry in {name}',
            subtitle=['Hover over sites to see information about entry of specific mutations'],
        )
    )
In [25]:
# call chart function
entry_by_site_plot_e2 = entry_by_site(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e2.display()
# Guard on the path that is actually written; the previous check tested the
# E3 output parameter instead of the E2 one.
if entry_by_site_plot_e2_output is not None:
    entry_by_site_plot_e2.save(entry_by_site_plot_e2_output)
In [26]:
# Build the CHO-bEFNB3 version of the chart from the effect_E3 column
entry_by_site_plot_e3 = entry_by_site(df_pivot,'CHO-bEFNB3','effect_E3')
entry_by_site_plot_e3.display()
# Save only when an output path was provided
if entry_by_site_plot_e3_output is not None:
    entry_by_site_plot_e3.save(entry_by_site_plot_e3_output)
In [27]:
# Stack the E2 and E3 charts vertically (displayed only, not saved)
combined_entry_by_site = (entry_by_site_plot_e2 & entry_by_site_plot_e3)
combined_entry_by_site.display()
TESTING. The cells below are a work in progress¶
In [28]:
def entry_by_site_test(df, name, effect):
    """Work-in-progress variant of the per-site entry chart.

    Same inputs as ``entry_by_site(df, name, effect)``. The lower panel draws
    the effect of each mutant at the hovered site as a bar height (y domain
    -4 to 2) instead of a coloured cell.
    """
    amino_acid_order = ["R","K","H","D","E","Q","N","S","T","Y","W","F","A","I","L","M","V","G","P","C"]

    # Scaffold holding every (site, mutant) pair for sites 71-602, left-joined
    # with the data so combinations without a measurement are still present
    scaffold = pd.DataFrame(
        [{"site": site, "mutant": aa} for site in range(71, 603) for aa in amino_acid_order]
    )
    full_df = scaffold.merge(df, on=['site', 'mutant'], how='left')

    # Hover selection keyed on site
    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site"], value=1
    )

    # Upper panel: mean effect per site
    base = alt.Chart(full_df).add_params(variant_selector)
    site_bars = base.mark_bar(opacity=1, stroke='black').encode(
        alt.X(
            "site:N",
            title='Site',
            axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600], tickCount=11, grid=True),
        ),
        alt.Y(f"mean({effect})", title="Mean entry"),
        tooltip=["site", "wildtype", "region"],
        # NOTE(review): both branches are 1, so this condition has no visible effect
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(1)),
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        color=alt.Color('region', title='Region'),
    ).properties(width=800, height=200)

    # Lower panel: one bar per mutant amino acid at the hovered site.
    # NOTE(review): the window transform computes a `rank` field that no
    # encoding in this function uses.
    mutant_bars = (
        alt.Chart(full_df)
        .mark_bar(stroke='black', color='darkgray')
        .encode(
            alt.X('mutant:N', title=None, scale=alt.Scale(domain=amino_acid_order)),
            alt.Y(f'{effect}', scale=alt.Scale(domain=[-4, 2]), title='Cell entry'),
        )
        .transform_filter(variant_selector)
        .transform_window(
            rank='rank()',
            sort=[alt.SortField(f'{effect}', order='descending')],
        )
        .properties(width=800, height=100)
    )

    # "X" text mark positioned at the wildtype residue of the hovered site.
    # The `!= None` comparison is deliberate: it builds a chart-side filter
    # expression rather than being evaluated in Python.
    wildtype_marker = (
        alt.Chart(full_df)
        .mark_text(color="black", text="X", size=14, align="center", baseline="middle")
        .encode(alt.X('wildtype:N', title='Amino acid'))
        .transform_filter(variant_selector)
        .transform_filter((alt.datum[effect] != None))
        .properties(width=800, height=100)
    )

    # Layer marker and bars on a shared x scale, then stack under the site bars
    mutant_panel = alt.layer(wildtype_marker, mutant_bars).resolve_scale(x='shared')
    stacked = alt.vconcat(site_bars, mutant_panel).resolve_scale(y='independent', x='independent')
    return stacked.properties(
        title=alt.Title(
            f'Entry in {name}',
            subtitle=['Hover over sites to see information about entry of specific mutations'],
        )
    )
entry_by_site_plot_e2_bar = entry_by_site_test(df_pivot,'CHO-bEFNB2','effect_E2')
entry_by_site_plot_e2_bar.display()
# entry_by_site_plot_e2_bar_plot holds the output path for this chart. Guard on
# it rather than on the unrelated entry_by_site_plot_e3_output parameter.
if entry_by_site_plot_e2_bar_plot is not None:
    entry_by_site_plot_e2_bar.save(entry_by_site_plot_e2_bar_plot)
In [29]:
def make_effect_by_site_with_hover_tooltip(df):
    """Line chart of mean entry per site and cell type with a hover read-out.

    ``df`` needs cell_type, site and effect columns. The effect is averaged per
    (cell_type, site) and rounded to two decimals. Hovering picks the nearest
    site, draws a vertical rule there and shows the point and its value on
    each line.
    """
    site_means = df.groupby(['cell_type', 'site'])['effect'].mean().reset_index().round(2)

    # Selection that follows the mouse and resolves to the nearest site
    nearest = alt.selection_point(nearest=True, on='mouseover', fields=['site'], empty=False)

    # Smoothed line per cell type
    line = (
        alt.Chart(site_means)
        .mark_line(interpolate='basis', size=1)
        .encode(
            alt.X('site:Q', title='Site', axis=alt.Axis(values=[100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600])),
            alt.Y('effect:Q', title='Mean entry'),
            color=alt.Color('cell_type:N', title='Cell type'),
        )
    )

    # Invisible points spanning the x axis; they carry the hover selection
    selectors = (
        alt.Chart(site_means)
        .mark_point()
        .encode(alt.X('site:Q'), opacity=alt.value(0))
        .add_params(nearest)
    )

    # Points on the lines, visible only at the selected site
    points = line.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )

    # Value labels next to the points, blank away from the selected site
    text = line.mark_text(align='left', dx=5, dy=-5, fontSize=15).encode(
        text=alt.condition(nearest, 'effect:Q', alt.value(' ')),
    )

    # Vertical rule at the selected site
    rules = (
        alt.Chart(site_means)
        .mark_rule(color='gray')
        .encode(x='site:Q')
        .transform_filter(nearest)
    )

    # Layer all five pieces into a single chart
    return alt.layer(line, selectors, points, rules, text).properties(width=800, height=200)
# Hover-enabled line chart of mean entry per site (displayed only, not saved)
alt_plot = make_effect_by_site_with_hover_tooltip(binding_entry_concat_df)
alt_plot.display()
In [30]:
def plot_affinity_individual_mutants(df, mutant):
    """Letter plot of bEFNB2 vs bEFNB3 binding for the mutants at one site.

    ``df`` needs site, wildtype, mutant, binding_E2, binding_E3, effect_E2,
    effect_E3 and 'mutant_type_CHO-bEFNB2' columns. Rows containing any missing
    value are dropped and values are rounded to two decimals. A slider (185 to
    602, initially 492) chooses which site's mutants are drawn as letters.

    NOTE(review): the ``mutant`` argument is accepted but never used, and the
    maximum of binding_E3 is printed as a side effect (looks like leftover
    debugging output).
    """
    complete = df.dropna().round(2).copy()
    x_max = complete['binding_E3'].max()
    print(x_max)

    # Slider-bound selection over the site field
    selector = alt.selection_point(
        name="SelectorName",
        fields=['site'],
        bind=alt.binding_range(min=185, max=602, step=1, name='Site'),
        value=[{'site': 492}],
    )

    # One letter per mutant, coloured by the mutant type from the bEFNB2 data
    letters = (
        alt.Chart(complete)
        .mark_text(size=20)
        .encode(
            alt.X("binding_E2", title=("bEFNB2 binding"), axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=[-4.5, 2])),
            alt.Y("binding_E3", title=("bEFNB3 binding"), axis=alt.Axis(tickCount=4), scale=alt.Scale(domain=[-2, 2])),
            alt.Text('mutant'),
            alt.Color('mutant_type_CHO-bEFNB2', title='Mutant type'),
            tooltip=['site', 'wildtype', 'binding_E2', 'binding_E3', 'effect_E2', 'effect_E3'],
        )
        .add_params(selector)
        .transform_filter(selector)
        .properties(
            title='Correlation of effects of amino acid mutations in RBP head on binding to bEFNB2 or bEFNB3',
            height=400,
            width=400,
        )
    )

    # Reference rules through x=0 and y=0
    zero_x = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(color='gray', opacity=0.5).encode(x='x:Q')
    zero_y = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(color='gray', opacity=0.5).encode(y='y:Q')

    return letters + zero_x + zero_y
test_plot = plot_affinity_individual_mutants(df_pivot,306)
test_plot.display()
# binding_letter_plot is the output path and stays None when the notebook is
# run interactively; skip saving in that case instead of failing.
if binding_letter_plot is not None:
    test_plot.save(binding_letter_plot)
1.26
In [31]:
# Show the wide per-mutant table that the work-in-progress plots above use
display(df_pivot)
| region | site | wildtype | mutant | binding_E2 | binding_E3 | effect_E2 | effect_E3 | mutant_type_CHO-bEFNB2 | mutant_type_CHO-bEFNB3 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Head | 178 | V | A | 0.7066 | 0.008861 | -0.28470 | 0.01306 | hydrophobic | hydrophobic |
| 1 | Head | 178 | V | C | 0.1814 | 0.451400 | 0.07725 | 0.47640 | special | special |
| 2 | Head | 178 | V | D | NaN | -0.041930 | -2.04100 | -1.03800 | None | negative |
| 3 | Head | 178 | V | E | NaN | 0.142800 | -1.74700 | -0.41900 | None | negative |
| 4 | Head | 178 | V | F | 0.5869 | 0.039550 | -1.01900 | -0.34260 | aromatic | aromatic |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9907 | Stalk | 147 | K | S | 0.1344 | -0.060950 | 0.14910 | 0.13650 | hydrophilic | hydrophilic |
| 9908 | Stalk | 147 | K | T | 1.0700 | -0.052750 | -0.43670 | -0.79560 | hydrophilic | hydrophilic |
| 9909 | Stalk | 147 | K | V | NaN | 0.086850 | -1.94400 | -1.02500 | None | hydrophobic |
| 9910 | Stalk | 147 | K | W | NaN | NaN | -2.90900 | -2.27500 | None | None |
| 9911 | Stalk | 147 | K | Y | NaN | NaN | -2.96300 | -1.39500 | None | None |
9912 rows × 10 columns
In [ ]: